In [ ]:
# --- Imports and environment setup ---
import warnings
warnings.filterwarnings("ignore")
from torchmetrics.classification import BinaryF1Score as F1Score
import matplotlib.pyplot as plt
from anomalib.models import Padim
from anomalib.engine import Engine
from anomalib.data import PredictDataset, MVTec
from anomalib import TaskType
from anomalib.deploy import ExportType, OpenVINOInferencer, CompressionType
from anomalib.utils.visualization.image import ImageVisualizer, VisualizationMode, ImageResult
from anomalib.metrics import AUROC
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast
from PIL import Image
from pynvml.smi import nvidia_smi
import numpy as np
from pathlib import Path
from timeit import default_timer as timer
import re
import sys
import psutil
import shutil
# Make the local TensorRT inferencer package importable.
sys.path.append("anomalib_trt_python")
try:
    from anomalib_trt_python.trt_inferencer import TrtInferencer
except RuntimeError:
    # The TRT inferencer touches CUDA at import time; without a GPU it raises.
    print("You need a CUDA device to run this script.")
import json
import torch
# Trade a little float32 matmul precision for speed on tensor-core GPUs.
torch.set_float32_matmul_precision('high')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NVML handle used later to sample GPU memory during TensorRT inference.
nvsmi = nvidia_smi.getInstance()
print(f"Using device: {device}")
print(torch.cuda.get_device_name(0))
# All models in this notebook are evaluated as segmentation tasks.
task = TaskType.SEGMENTATION
Using device: cuda NVIDIA GeForce RTX 4070 Laptop GPU
In [ ]:
# Choose a category
categories = ['transistor', 'bottle', 'carpet', 'hazelnut', 'screw']
category = categories[1]  # bottle
# Choose a test folder for inference
specific_folder = "broken_large"
inference_path = Path(f'MVTec_test/{category}/test/{specific_folder}')
# Create the per-category output folder. exist_ok=True makes this idempotent
# and avoids the race-prone exists()-then-makedirs() pattern.
import os
os.makedirs(f"./{category}", exist_ok=True)
In [ ]:
def test_model(engine, data, inference, model=None):
    """Run inference with a given backend, visualize predictions, and print metrics.

    Args:
        engine: Anomalib ``Engine`` (torch backends) or an inferencer object
            exposing ``predict`` (onnx / openvino / trt backends).
        data: list of image path strings for 'onnx'/'openvino'/'trt', or a
            ``DataLoader`` for 'torch_fp32'/'torch_fp16'.
        inference: one of 'onnx', 'trt', 'torch_fp32', 'torch_fp16', 'openvino'.
        model: torch model, required for the torch backends.

    Returns:
        Tuple of (saved figure path, total inference time in seconds,
        peak memory figure, list of (visualization, ImageResult) pairs).
        NOTE(review): the memory figure's unit depends on the backend —
        bytes for torch/onnx/openvino, MiB (from NVML) for trt.

    Raises:
        ValueError: for an unknown ``inference`` string, or a torch backend
            without a ``model``.
    """
    img_visualizer = ImageVisualizer(mode=VisualizationMode.FULL, task=task)
    output_images = []
    total_time = 0
    # Initialize so the return statement never hits an unbound name when
    # `data` is empty (previously only assigned inside the loops/branches).
    max_memory_allocated = 0
    inferences = ['onnx', 'trt', 'torch_fp32', 'torch_fp16', 'openvino']
    if inference not in inferences:
        raise ValueError(f'Invalid inference type. Choose from {inferences}')
    if inference == 'onnx' or inference == 'trt' or inference == 'openvino':
        for img_path in data:
            time_0 = timer()
            pred_img_result = engine.predict(img_path)
            time_1 = timer()
            if inference != 'trt':
                # CPU backends: sample process/system RAM usage (bytes).
                max_memory_allocated = psutil.virtual_memory().used
            else:
                # TensorRT runs on GPU: sample framebuffer usage via NVML (MiB).
                max_memory_allocated = nvsmi.DeviceQuery('memory.used')
                max_memory_allocated = max_memory_allocated['gpu'][0]['fb_memory_usage']['used']
            inference_time = time_1 - time_0
            total_time += inference_time
            # Derive the MVTec ground-truth mask path from the test image path:
            # .../test/<defect>/000.png -> .../ground_truth/<defect>/000_mask.png
            gt = re.sub(r'\btest\b', 'ground_truth', img_path)
            # Fixed regex: the original r'\b.png\b' left the '.' unescaped
            # (matches any character) and had no anchor; escape it and anchor
            # to the end of the path.
            gt = re.sub(r'\.png$', '_mask.png', gt)
            if pred_img_result.image.shape[:2] == (256, 256):
                gt_img = np.array(Image.open(gt).resize((256, 256)))
            else:
                gt_img = np.array(Image.open(gt))
            pred_img_result.gt_mask = gt_img
            output_images.append((img_visualizer.visualize_image(pred_img_result), pred_img_result))
    else:
        # data is a dataloader
        if model is None:
            raise ValueError('Model is required for torch inference')
        model.eval()
        model.to(device)
        if inference == 'torch_fp16':
            # Run the whole prediction pass under autocast for mixed precision.
            with autocast():
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()
                time_0 = timer()
                predictions = engine.predict(model, data)
                time_1 = timer()
                max_memory_allocated = torch.cuda.max_memory_allocated(device)
                total_time = time_1 - time_0
        else:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
            time_0 = timer()
            predictions = engine.predict(model, data)
            time_1 = timer()
            max_memory_allocated = torch.cuda.max_memory_allocated(device)
            total_time = time_1 - time_0
        for i in range(len(predictions)):
            pred = predictions[i]
            image_path = pred["image_path"][0]
            image_size = pred["image"].shape[-2:]
            # NOTE(review): PIL's resize takes (width, height) while
            # shape[-2:] is (H, W); identical here only because inputs are
            # square (256x256) — confirm before using non-square sizes.
            image = np.array(Image.open(image_path).resize(image_size))
            anomaly_map = pred["anomaly_maps"][0]
            anomaly_map = anomaly_map.cpu().numpy().squeeze()
            gt = re.sub(r'\btest\b', 'ground_truth', image_path)
            # Same regex fix as above (escape '.' and anchor to end).
            gt = re.sub(r'\.png$', '_mask.png', gt)
            visualizer = ImageVisualizer(mode=VisualizationMode.FULL, task=task)
            # Build an ImageResult from whichever fields this prediction has.
            image_result = ImageResult(
                image=image,
                pred_score=pred["pred_scores"][0].cpu().numpy().item() if "pred_scores" in pred else None,
                pred_label=pred["pred_labels"][0].cpu().numpy().item() if "pred_labels" in pred else None,
                anomaly_map=pred["anomaly_maps"][0].cpu().numpy() if "anomaly_maps" in pred else None,
                pred_mask=pred["pred_masks"][0].squeeze().int().cpu().numpy() if "pred_masks" in pred else None,
                gt_mask=pred["mask"][0].squeeze().int().cpu().numpy() if "mask" in pred else None,
                gt_boxes=pred["boxes"][0].cpu().numpy() if "boxes" in pred else None,
                pred_boxes=pred["pred_boxes"][0].cpu().numpy() if "pred_boxes" in pred else None,
                box_labels=pred["box_labels"][0].cpu().numpy() if "box_labels" in pred else None,
            )
            gt_img_np = np.array(Image.open(gt).resize(image_size))
            image_result.gt_mask = gt_img_np
            output_images.append((visualizer.visualize_image(image_result), image_result))
    # --- Aggregate metrics over all predicted images ---
    sum_f1_pixel = 0
    sum_f1_img = 0
    sum_auroc_pixel = 0
    sum_auroc_img = 0
    fig, ax = plt.subplots(len(output_images), 1, figsize=(20, 20))
    i = 0
    for img, pred_img_result in output_images:
        # plt.subplots returns a bare Axes (not an array) for a single row.
        if len(output_images) > 1:
            ax[i].imshow(img)
            ax[i].axis('off')
            i += 1
        else:
            ax.imshow(img)
            ax.axis('off')
        auroc = AUROC().to(device)
        pred_mask = torch.from_numpy(pred_img_result.pred_mask).to(device)
        gt_mask = torch.from_numpy(pred_img_result.gt_mask).to(device)
        # Image-level score: mean of the predicted mask per row block;
        # image-level label: any positive ground-truth pixel in the row block.
        image_scores = torch.mean(pred_mask.view(pred_mask.shape[0], -1).float(), dim=1)
        gt_image_labels = torch.any(gt_mask.view(gt_mask.shape[0], -1) > 0, dim=1).long()
        img_lvl_auroc = auroc(image_scores, gt_image_labels)
        # Binarize the ground truth (masks may be stored as 0/255).
        gt_mask = torch.where(gt_mask > 0, torch.tensor(1, device=gt_mask.device), torch.tensor(0, device=gt_mask.device))
        pred_mask_flat = pred_mask.view(-1).float()
        gt_mask_flat = gt_mask.view(-1).long()
        pixel_lvl_auroc = auroc(pred_mask_flat, gt_mask_flat)
        f1_score = F1Score().to(device)
        pixel_lvl_f1 = f1_score(pred_mask_flat, gt_mask_flat)
        img_lvl_f1 = f1_score(image_scores, gt_image_labels)
        sum_f1_pixel += pixel_lvl_f1
        sum_f1_img += img_lvl_f1
        sum_auroc_pixel += pixel_lvl_auroc
        sum_auroc_img += img_lvl_auroc
    print(f'Average pixel-level F1 score: {sum_f1_pixel / len(output_images)}')
    print(f'Average image-level F1 score: {sum_f1_img / len(output_images)}')
    print(f'Average pixel-level AUROC: {sum_auroc_pixel / len(output_images)}')
    print(f'Average image-level AUROC: {sum_auroc_img / len(output_images)}')
    output_path = Path(f'./{category}/output_{inference}.png')
    fig.tight_layout()
    plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0)
    return output_path, total_time, max_memory_allocated, output_images
In [ ]:
# Torch FP32 model
model_fp32 = Padim(backbone='wide_resnet50_2')
engine_fp32 = Engine()
fit = False  # True forces retraining; False loads the checkpoint when it exists
ckpt_fp32 = Path(f"./{category}/padim_fp32_torch.pth")
if ckpt_fp32.exists() and not fit:
    # Reuse the previously trained weights.
    model_fp32.load_state_dict(torch.load(ckpt_fp32))
    print("Loaded model from disk")
else:
    # Build the MVTec datamodule and fit PaDiM from scratch.
    dataset_fit = Path("./MVTec_fit")
    datamodule_fp32 = MVTec(
        root=dataset_fit,
        category=category,
        image_size=256,
        train_batch_size=32,
        eval_batch_size=32,
        num_workers=4,
        task=task,
    )
    engine_fp32.fit(datamodule=datamodule_fp32, model=model_fp32)
    print("Model trained")
    # Persist the fp32 weights for later runs and conversions.
    torch.save(model_fp32.state_dict(), ckpt_fp32)
    print("Model saved to disk")
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback. GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs `Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch.. F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓ ┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃ ┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩ │ 0 │ model │ PadimModel │ 24.9 M │ train │ │ 1 │ _transform │ Compose │ 0 │ train │ │ 2 │ normalization_metrics │ MinMax │ 0 │ train │ │ 3 │ image_threshold │ F1AdaptiveThreshold │ 0 │ train │ │ 4 │ pixel_threshold │ F1AdaptiveThreshold │ 0 │ train │ │ 5 │ image_metrics │ AnomalibMetricCollection │ 0 │ train │ │ 6 │ pixel_metrics │ AnomalibMetricCollection │ 0 │ train │ └───┴───────────────────────┴──────────────────────────┴────────┴───────┘
Trainable params: 24.9 M Non-trainable params: 0 Total params: 24.9 M Total estimated model params size (MB): 99
Output()
`Trainer.fit` stopped: `max_epochs=1` reached.
Model trained Model saved to disk
In [ ]:
# Inference FP32 model
inference_dataset_fp32 = PredictDataset(path=inference_path)
inference_dataloader_fp32 = DataLoader(dataset=inference_dataset_fp32)
results_fp32 = test_model(engine_fp32, inference_dataloader_fp32, 'torch_fp32', model=model_fp32)
output_path_FP32, inference_time_FP32, memory_FP32, fp32_output_images = results_fp32
print(f"Time taken for inference (FP32): {inference_time_FP32:.2f} seconds, memory used: {memory_FP32 / 1024**3:.2f} GB")
# Display the saved visualization grid.
Image.open(output_path_FP32)
ckpt_path is not provided. Model weights will not be loaded. F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Output()
Average pixel-level F1 score: 0.748151957988739 Average image-level F1 score: 0.9572950601577759 Average pixel-level AUROC: 0.9305204749107361 Average image-level AUROC: 0.9679543375968933 Time taken for inference (FP32): 2.23 seconds, memory used: 4.83 GB
Out[ ]:
In [ ]:
# Torch FP16 model
#
# Load the FP32 model and convert it to FP16
# If the FP16 model already exists and fit is False, load it from disk
if Path(f"./{category}/padim_fp16_torch.pth").exists() and not fit:
    model_fp16 = Padim(backbone='wide_resnet50_2')
    model_fp16.load_state_dict(torch.load(f"./{category}/padim_fp16_torch.pth"))
    # BUG FIX: load_state_dict copies tensors into the *destination* dtype,
    # so loading an FP16 checkpoint into a freshly built FP32 model silently
    # upcast the weights back to FP32. Convert after loading so the loaded
    # model is actually half precision, matching the converted branch.
    model_fp16 = model_fp16.half()
    print("Loaded model from disk")
# Convert the FP32 model to FP16 and save it to disk if it doesn't exist or fit is True
else:
    model_fp32_to_16 = Padim(backbone='wide_resnet50_2')
    model_fp32_to_16.load_state_dict(torch.load(f"./{category}/padim_fp32_torch.pth"))
    model_fp16 = model_fp32_to_16.half()
    print("Converted the FP32 model and saved it to disk")
# Save the FP16 model
torch.save(model_fp16.state_dict(), f"./{category}/padim_fp16_torch.pth")
Converted the FP32 model and saved it to disk
In [ ]:
engine_fp16 = Engine()
# Inference FP16 model
inference_dataset_fp16 = PredictDataset(path=inference_path)
inference_dataloader_fp16 = DataLoader(dataset=inference_dataset_fp16)
results_fp16 = test_model(engine_fp16, inference_dataloader_fp16, 'torch_fp16', model_fp16)
output_path_fp16, inference_time_fp16, memory_fp16, fp_16_output_images = results_fp16
print(f"Time taken for inference: {inference_time_fp16:.2f} seconds, memory used: {memory_fp16 / 1024**3:.2f} GB")
# Display the saved visualization grid.
Image.open(output_path_fp16)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback. GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs `Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch.. ckpt_path is not provided. Model weights will not be loaded. F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Output()
Average pixel-level F1 score: 0.7480775117874146 Average image-level F1 score: 0.9572950601577759 Average pixel-level AUROC: 0.930578887462616 Average image-level AUROC: 0.9679543375968933 Time taken for inference: 1.90 seconds, memory used: 2.40 GB
Out[ ]:
In [ ]:
# ONNX FP32 model
#
# Export from Torch FP32 to ONNX
# If the ONNX model already exists and fit is False, load it from disk
onnx_dir = Path(f"./{category}/padim_fp32_onnx")
if (onnx_dir / "weights" / "onnx" / "model.onnx").exists() and not fit:
    print("ONNX model already exists")
# Export the model to ONNX and save it to disk if it doesn't exist or fit is True
else:
    model_fp32.eval()
    # Delete the folder if it exists to avoid errors
    if onnx_dir.exists():
        shutil.rmtree(onnx_dir)
    engine_fp32.export(model=model_fp32, export_type=ExportType.ONNX, export_root=f'./{category}/padim_fp32_onnx', input_size=(256, 256))
    print("ONNX model exported and saved to disk")
ONNX model exported and saved to disk
In [ ]:
# Inference ONNX model
onnx_weights_dir = f"./{category}/padim_fp32_onnx/weights/onnx"
inferencer_onnx = OpenVINOInferencer(
    path=f"{onnx_weights_dir}/model.onnx",
    metadata=f"{onnx_weights_dir}/metadata.json",
    task=task
)
# Feed raw image paths; the inferencer handles preprocessing itself.
data = [str(img) for img in inference_path.glob('*.png')]
output_path_onnx, inference_time_onnx, memory_onnx, onnx_output_images = test_model(inferencer_onnx, data, 'onnx')
print(f"Inference time: {inference_time_onnx:.2f} seconds, Memory: {memory_onnx/1024**3:.2f} GB")
Image.open(output_path_onnx)
Average pixel-level F1 score: 0.7273775935173035 Average image-level F1 score: 0.9473565220832825 Average pixel-level AUROC: 0.928958535194397 Average image-level AUROC: 0.9539563059806824 Inference time: 0.59 seconds, Memory: 22.41 GB
Out[ ]:
In [ ]:
# OpenVINO FP16 model
#
# Export from Torch FP32 to OpenVINO
# If the OpenVINO model already exists and fit is False, load it from disk
openvino_dir = Path(f"./{category}/padim_fp16_openvino")
if (openvino_dir / "weights" / "openvino" / "model.xml").exists() and not fit:
    print("OpenVINO model already exists")
# Export the model to OpenVINO and save it to disk if it doesn't exist or fit is True
else:
    model_fp32.eval()
    # Remove any stale export before re-exporting.
    if openvino_dir.exists():
        shutil.rmtree(openvino_dir)
    engine_fp32.export(model=model_fp32, export_type=ExportType.OPENVINO, export_root=f'./{category}/padim_fp16_openvino', compression_type=CompressionType.FP16, input_size=(256, 256))
    print("OpenVINO model exported and saved to disk")
OpenVINO model exported and saved to disk
In [ ]:
# Inference OpenVINO model
openvino_weights_dir = f"./{category}/padim_fp16_openvino/weights/openvino"
inferencer_openvino = OpenVINOInferencer(
    path=f"{openvino_weights_dir}/model.xml",
    metadata=f"{openvino_weights_dir}/metadata.json",
    task=task,
)
# Feed raw image paths; the inferencer handles preprocessing itself.
data = [str(img) for img in inference_path.glob('*.png')]
output_path_openvino, inference_time_openvino, memory_openvino, openvino_output_images = test_model(inferencer_openvino, data, 'openvino')
print(f"Inference time: {inference_time_openvino:.2f} seconds, Memory: {memory_openvino / 1024**3:.2f} GB")
Image.open(output_path_openvino)
Average pixel-level F1 score: 0.7258498072624207 Average image-level F1 score: 0.9475395083427429 Average pixel-level AUROC: 0.9287086725234985 Average image-level AUROC: 0.9533352255821228 Inference time: 0.54 seconds, Memory: 17.95 GB
Out[ ]:
In [ ]:
# From ONNX to TensorRT
# If the TensorRT engine already exists and fit is False, load it from disk
if Path(f"./{category}/padim_fp32_trt.engine").exists() and not fit:
    print("TensorRT engine already exists")
# Convert the ONNX model to TensorRT and save it to disk if it doesn't exist or fit is True
else:
    onnx_path = f"./{category}/padim_fp32_onnx/weights/onnx/model.onnx"
    engine_path = f"./{category}/padim_fp32_trt.engine"
    # IPython shell magic (notebook-only): build and serialize a TensorRT
    # engine from the ONNX export with trtexec's default (FP32) settings.
    !trtexec --onnx={onnx_path} --saveEngine={engine_path}
    print("TensorRT engine created and saved to disk")
&&&& RUNNING TensorRT.trtexec [TensorRT v100100] # trtexec --onnx=./bottle/padim_fp32_onnx/weights/onnx/model.onnx --saveEngine=./bottle/padim_fp32_trt.engineTensorRT engine created and saved to disk
[07/04/2024-21:45:25] [I] === Model Options ===
[07/04/2024-21:45:25] [I] Format: ONNX
[07/04/2024-21:45:25] [I] Model: ./bottle/padim_fp32_onnx/weights/onnx/model.onnx
[07/04/2024-21:45:25] [I] Output:
[07/04/2024-21:45:25] [I] === Build Options ===
[07/04/2024-21:45:25] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default, tacticSharedMem: default
[07/04/2024-21:45:25] [I] avgTiming: 8
[07/04/2024-21:45:25] [I] Precision: FP32
[07/04/2024-21:45:25] [I] LayerPrecisions:
[07/04/2024-21:45:25] [I] Layer Device Types:
[07/04/2024-21:45:25] [I] Calibration:
[07/04/2024-21:45:25] [I] Refit: Disabled
[07/04/2024-21:45:25] [I] Strip weights: Disabled
[07/04/2024-21:45:25] [I] Version Compatible: Disabled
[07/04/2024-21:45:25] [I] ONNX Plugin InstanceNorm: Disabled
[07/04/2024-21:45:25] [I] TensorRT runtime: full
[07/04/2024-21:45:25] [I] Lean DLL Path:
[07/04/2024-21:45:25] [I] Tempfile Controls: { in_memory: allow, temporary: allow }
[07/04/2024-21:45:25] [I] Exclude Lean Runtime: Disabled
[07/04/2024-21:45:25] [I] Sparsity: Disabled
[07/04/2024-21:45:25] [I] Safe mode: Disabled
[07/04/2024-21:45:25] [I] Build DLA standalone loadable: Disabled
[07/04/2024-21:45:25] [I] Allow GPU fallback for DLA: Disabled
[07/04/2024-21:45:25] [I] DirectIO mode: Disabled
[07/04/2024-21:45:25] [I] Restricted mode: Disabled
[07/04/2024-21:45:25] [I] Skip inference: Disabled
[07/04/2024-21:45:25] [I] Save engine: ./bottle/padim_fp32_trt.engine
[07/04/2024-21:45:25] [I] Load engine:
[07/04/2024-21:45:25] [I] Profiling verbosity: 0
[07/04/2024-21:45:25] [I] Tactic sources: Using default tactic sources
[07/04/2024-21:45:25] [I] timingCacheMode: local
[07/04/2024-21:45:25] [I] timingCacheFile:
[07/04/2024-21:45:25] [I] Enable Compilation Cache: Enabled
[07/04/2024-21:45:25] [I] errorOnTimingCacheMiss: Disabled
[07/04/2024-21:45:25] [I] Preview Features: Use default preview flags.
[07/04/2024-21:45:25] [I] MaxAuxStreams: -1
[07/04/2024-21:45:25] [I] BuilderOptimizationLevel: -1
[07/04/2024-21:45:25] [I] Calibration Profile Index: 0
[07/04/2024-21:45:25] [I] Weight Streaming: Disabled
[07/04/2024-21:45:25] [I] Debug Tensors:
[07/04/2024-21:45:25] [I] Input(s)s format: fp32:CHW
[07/04/2024-21:45:25] [I] Output(s)s format: fp32:CHW
[07/04/2024-21:45:25] [I] Input build shapes: model
[07/04/2024-21:45:25] [I] Input calibration shapes: model
[07/04/2024-21:45:25] [I] === System Options ===
[07/04/2024-21:45:25] [I] Device: 0
[07/04/2024-21:45:25] [I] DLACore:
[07/04/2024-21:45:25] [I] Plugins:
[07/04/2024-21:45:25] [I] setPluginsToSerialize:
[07/04/2024-21:45:25] [I] dynamicPlugins:
[07/04/2024-21:45:25] [I] ignoreParsedPluginLibs: 0
[07/04/2024-21:45:25] [I]
[07/04/2024-21:45:25] [I] === Inference Options ===
[07/04/2024-21:45:25] [I] Batch: Explicit
[07/04/2024-21:45:25] [I] Input inference shapes: model
[07/04/2024-21:45:25] [I] Iterations: 10
[07/04/2024-21:45:25] [I] Duration: 3s (+ 200ms warm up)
[07/04/2024-21:45:25] [I] Sleep time: 0ms
[07/04/2024-21:45:25] [I] Idle time: 0ms
[07/04/2024-21:45:25] [I] Inference Streams: 1
[07/04/2024-21:45:25] [I] ExposeDMA: Disabled
[07/04/2024-21:45:25] [I] Data transfers: Enabled
[07/04/2024-21:45:25] [I] Spin-wait: Disabled
[07/04/2024-21:45:25] [I] Multithreading: Disabled
[07/04/2024-21:45:25] [I] CUDA Graph: Disabled
[07/04/2024-21:45:25] [I] Separate profiling: Disabled
[07/04/2024-21:45:25] [I] Time Deserialize: Disabled
[07/04/2024-21:45:25] [I] Time Refit: Disabled
[07/04/2024-21:45:25] [I] NVTX verbosity: 0
[07/04/2024-21:45:25] [I] Persistent Cache Ratio: 0
[07/04/2024-21:45:25] [I] Optimization Profile Index: 0
[07/04/2024-21:45:25] [I] Weight Streaming Budget: 100.000000%
[07/04/2024-21:45:25] [I] Inputs:
[07/04/2024-21:45:25] [I] Debug Tensor Save Destinations:
[07/04/2024-21:45:25] [I] === Reporting Options ===
[07/04/2024-21:45:25] [I] Verbose: Disabled
[07/04/2024-21:45:25] [I] Averages: 10 inferences
[07/04/2024-21:45:25] [I] Percentiles: 90,95,99
[07/04/2024-21:45:25] [I] Dump refittable layers:Disabled
[07/04/2024-21:45:25] [I] Dump output: Disabled
[07/04/2024-21:45:25] [I] Profile: Disabled
[07/04/2024-21:45:25] [I] Export timing to JSON file:
[07/04/2024-21:45:25] [I] Export output to JSON file:
[07/04/2024-21:45:25] [I] Export profile to JSON file:
[07/04/2024-21:45:25] [I]
[07/04/2024-21:45:25] [I] === Device Information ===
[07/04/2024-21:45:25] [I] Available Devices:
[07/04/2024-21:45:25] [I] Device 0: "NVIDIA GeForce RTX 4070 Laptop GPU" UUID: GPU-e58646d2-c4c6-37c2-ebcb-9018ebe88a47
[07/04/2024-21:45:25] [I] Selected Device: NVIDIA GeForce RTX 4070 Laptop GPU
[07/04/2024-21:45:25] [I] Selected Device ID: 0
[07/04/2024-21:45:25] [I] Selected Device UUID: GPU-e58646d2-c4c6-37c2-ebcb-9018ebe88a47
[07/04/2024-21:45:25] [I] Compute Capability: 8.9
[07/04/2024-21:45:25] [I] SMs: 36
[07/04/2024-21:45:25] [I] Device Global Memory: 8187 MiB
[07/04/2024-21:45:25] [I] Shared Memory per SM: 100 KiB
[07/04/2024-21:45:25] [I] Memory Bus Width: 128 bits (ECC disabled)
[07/04/2024-21:45:25] [I] Application Compute Clock Rate: 1.695 GHz
[07/04/2024-21:45:25] [I] Application Memory Clock Rate: 8.001 GHz
[07/04/2024-21:45:25] [I]
[07/04/2024-21:45:25] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
[07/04/2024-21:45:25] [I]
[07/04/2024-21:45:25] [I] TensorRT version: 10.1.0
[07/04/2024-21:45:25] [I] Loading standard plugins
[07/04/2024-21:45:25] [I] [TRT] [MemUsageChange] Init CUDA: CPU +86, GPU +0, now: CPU 18672, GPU 1131 (MiB)
[07/04/2024-21:45:32] [I] [TRT] [MemUsageChange] Init builder kernel library: CPU +2396, GPU +290, now: CPU 21387, GPU 1421 (MiB)
[07/04/2024-21:45:32] [I] Start parsing network model.
[07/04/2024-21:45:32] [I] [TRT] ----------------------------------------------------------------
[07/04/2024-21:45:32] [I] [TRT] Input filename: ./bottle/padim_fp32_onnx/weights/onnx/model.onnx
[07/04/2024-21:45:32] [I] [TRT] ONNX IR version: 0.0.7
[07/04/2024-21:45:32] [I] [TRT] Opset version: 14
[07/04/2024-21:45:32] [I] [TRT] Producer name: pytorch
[07/04/2024-21:45:32] [I] [TRT] Producer version: 2.3.1
[07/04/2024-21:45:32] [I] [TRT] Domain:
[07/04/2024-21:45:32] [I] [TRT] Model version: 0
[07/04/2024-21:45:32] [I] [TRT] Doc string:
[07/04/2024-21:45:32] [I] [TRT] ----------------------------------------------------------------
[07/04/2024-21:45:38] [I] Finished parsing network model. Parse time: 6.11513
[07/04/2024-21:45:38] [I] [TRT] Local timing cache in use. Profiling results in this builder pass will not be stored.
[07/04/2024-21:51:56] [I] [TRT] Detected 1 inputs and 1 output network tensors.
[07/04/2024-21:51:57] [I] [TRT] Total Host Persistent Memory: 284704
[07/04/2024-21:51:57] [I] [TRT] Total Device Persistent Memory: 4096
[07/04/2024-21:51:57] [I] [TRT] Total Scratch Memory: 18038784
[07/04/2024-21:51:57] [I] [TRT] [BlockAssignment] Started assigning block shifts. This will take 86 steps to complete.
[07/04/2024-21:51:57] [I] [TRT] [BlockAssignment] Algorithm ShiftNTopDown took 1.6149ms to assign 8 blocks to 86 nodes requiring 48449024 bytes.
[07/04/2024-21:51:57] [I] [TRT] Total Activation Memory: 48447488
[07/04/2024-21:51:57] [I] [TRT] Total Weights Memory: 5106776580
[07/04/2024-21:51:58] [I] [TRT] Engine generation completed in 379.121 seconds.
[07/04/2024-21:51:58] [I] [TRT] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 16 MiB, GPU 4871 MiB
[07/04/2024-21:51:59] [I] [TRT] [MemUsageStats] Peak memory usage during Engine building and serialization: CPU: 17062 MiB
[07/04/2024-21:51:59] [I] Engine built in 380.59 sec.
[07/04/2024-21:51:59] [I] Created engine with size: 4872.52 MiB
[07/04/2024-21:52:08] [I] [TRT] Loaded engine size: 4872 MiB
[07/04/2024-21:52:10] [I] Engine deserialized in 4.93558 sec.
[07/04/2024-21:52:10] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +46, now: CPU 0, GPU 4916 (MiB)
[07/04/2024-21:52:10] [I] Setting persistentCacheLimit to 0 bytes.
[07/04/2024-21:52:10] [I] Created execution context with device memory size: 46.2031 MiB
[07/04/2024-21:52:10] [I] Using random values for input input
[07/04/2024-21:52:10] [I] Input binding for input with dimensions 1x3x256x256 is created.
[07/04/2024-21:52:10] [I] Output binding for output with dimensions 1x1x256x256 is created.
[07/04/2024-21:52:10] [I] Starting inference
[07/04/2024-21:52:13] [I] Warmup completed 8 queries over 200 ms
[07/04/2024-21:52:13] [I] Timing trace has 125 queries over 3.07641 s
[07/04/2024-21:52:13] [I]
[07/04/2024-21:52:13] [I] === Trace details ===
[07/04/2024-21:52:13] [I] Trace averages of 10 runs:
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.5723 ms - Host latency: 24.7014 ms (enqueue 0.5468 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.5085 ms - Host latency: 24.6504 ms (enqueue 0.581119 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.4414 ms - Host latency: 24.5819 ms (enqueue 0.577014 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.5085 ms - Host latency: 24.6436 ms (enqueue 0.551141 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.4224 ms - Host latency: 24.5595 ms (enqueue 0.566248 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.3223 ms - Host latency: 24.4544 ms (enqueue 0.556348 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.3036 ms - Host latency: 24.4417 ms (enqueue 0.586023 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.3649 ms - Host latency: 24.4907 ms (enqueue 0.547034 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.3579 ms - Host latency: 24.4905 ms (enqueue 0.586182 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.3458 ms - Host latency: 24.4832 ms (enqueue 0.611108 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.299 ms - Host latency: 24.4352 ms (enqueue 0.610498 ms)
[07/04/2024-21:52:13] [I] Average on 10 runs - GPU latency: 24.3276 ms - Host latency: 24.4605 ms (enqueue 0.598096 ms)
[07/04/2024-21:52:13] [I]
[07/04/2024-21:52:13] [I] === Performance summary ===
[07/04/2024-21:52:13] [I] Throughput: 40.6318 qps
[07/04/2024-21:52:13] [I] Latency: min = 24.3394 ms, max = 24.8159 ms, mean = 24.5329 ms, median = 24.5139 ms, percentile(90%) = 24.6804 ms, percentile(95%) = 24.7084 ms, percentile(99%) = 24.7805 ms
[07/04/2024-21:52:13] [I] Enqueue Time: min = 0.428223 ms, max = 0.770264 ms, mean = 0.57711 ms, median = 0.578491 ms, percentile(90%) = 0.675781 ms, percentile(95%) = 0.706543 ms, percentile(99%) = 0.761963 ms
[07/04/2024-21:52:13] [I] H2D Latency: min = 0.072998 ms, max = 0.156006 ms, mean = 0.0908423 ms, median = 0.0830078 ms, percentile(90%) = 0.110352 ms, percentile(95%) = 0.119385 ms, percentile(99%) = 0.129883 ms
[07/04/2024-21:52:13] [I] GPU Compute Time: min = 24.2186 ms, max = 24.6702 ms, mean = 24.3984 ms, median = 24.3866 ms, percentile(90%) = 24.5381 ms, percentile(95%) = 24.5883 ms, percentile(99%) = 24.666 ms
[07/04/2024-21:52:13] [I] D2H Latency: min = 0.0305176 ms, max = 0.0925293 ms, mean = 0.0436622 ms, median = 0.0402832 ms, percentile(90%) = 0.0651855 ms, percentile(95%) = 0.0683594 ms, percentile(99%) = 0.0903931 ms
[07/04/2024-21:52:13] [I] Total Host Walltime: 3.07641 s
[07/04/2024-21:52:13] [I] Total GPU Compute Time: 3.0498 s
[07/04/2024-21:52:13] [I] Explanations of the performance metrics are printed in the verbose logs.
[07/04/2024-21:52:13] [I]
&&&& PASSED TensorRT.trtexec [TensorRT v100100] # trtexec --onnx=./bottle/padim_fp32_onnx/weights/onnx/model.onnx --saveEngine=./bottle/padim_fp32_trt.engine
In [ ]:
# Update metadata for TensorRT model: copy the normalization thresholds and
# min/max statistics produced by the ONNX export into the hand-maintained
# PaDiM metadata file used by the TRT inferencer.
# (Fixed: the original used json.load(open(...)) which leaks file handles;
# use context managers so files are closed deterministically.)
with open(f"./{category}/padim_fp32_onnx/weights/onnx/metadata.json") as f:
    metadata_onnx = json.load(f)
with open("metadata_padim.json") as f:
    metadata_padim = json.load(f)
metadata_padim["image_threshold"] = metadata_onnx["image_threshold"]
metadata_padim["pixel_threshold"] = metadata_onnx["pixel_threshold"]
metadata_padim["min"] = metadata_onnx["min"]
metadata_padim["max"] = metadata_onnx["max"]
with open("metadata_padim.json", 'w') as f:
    json.dump(metadata_padim, f, indent=4)
# Inference TensorRT model
inferencer_trt = TrtInferencer(
    path=f"./{category}/padim_fp32_trt.engine",
    metadata="metadata_padim.json",  # dropped stray f-prefix (no placeholders)
    task=task
)
data = [str(img) for img in inference_path.glob('*.png')]
output_path_trt, inference_time_trt, memory_trt, trt_output_images = test_model(inferencer_trt, data, 'trt')
# NOTE(review): memory_trt is reported by NVML in MiB, hence /1024 (not /1024**3).
print(f"Inference time: {inference_time_trt:.2f} seconds, Memory: {memory_trt/1024:.2f} GB")
Image.open(output_path_trt)
Reading metadata from file metadata_padim.json...
metadata: {'task': 'segmentation', 'transform': {'__version__': '1.3.1', 'transform': {'__class_fullname__': 'Compose', 'p': 1.0, 'transforms': [{'__class_fullname__': 'Resize', 'always_apply': True, 'p': 1, 'height': 256, 'width': 256, 'interpolation': 1}, {'__class_fullname__': 'Normalize', 'always_apply': False, 'p': 1.0, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'max_pixel_value': 255.0}, {'__class_fullname__': 'ToTensorV2', 'always_apply': True, 'p': 1.0, 'transpose_mask': False}], 'bbox_params': None, 'keypoint_params': None, 'additional_targets': {'image': 'image', 'depth_image': 'image'}, 'is_check_shapes': True}}, 'image_threshold': 206.597412109375, 'pixel_threshold': 170.80052185058594, 'min': 1.7707254886627197, 'max': 687.2928466796875}
inference batchsize = 1
Reading engine from file ./bottle/padim_fp32_trt.engine...
warm up finished...
Average pixel-level F1 score: 0.7262067198753357
Average image-level F1 score: 0.9472300410270691
Average pixel-level AUROC: 0.9288350343704224
Average image-level AUROC: 0.9530868530273438
Inference time: 0.47 seconds, Memory: 5.46 GB
Out[ ]:
In [ ]:
# Comparison of the inference time and memory usage
inference_times = [inference_time_FP32, inference_time_fp16, inference_time_trt]
memory_usage = [memory_FP32 / 1024**3, memory_fp16 / 1024**3, memory_trt / 1024]  # Convert to GB
gpu_backends = ['FP32', 'FP16', 'TensorRT']
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# One bar chart per metric; annotate each bar with its rounded value.
for axis, values, title, ylabel in (
    (ax[0], inference_times, 'Inference Time (GPU)', 'Time (s)'),
    (ax[1], memory_usage, 'Memory Usage (VRAM)', 'Memory (GB)'),
):
    axis.bar(gpu_backends, values)
    axis.set_title(title)
    axis.set_ylabel(ylabel)
    for i, v in enumerate(values):
        axis.text(i, v, str(round(v, 2)), ha='center', va='bottom')
plt.tight_layout()
output = Path(f"./{category}/inference_comparison.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
In [ ]:
# Percentage improvement in inference time and memory usage with TensorRT
# (relative to the torch FP32 baseline; negative means TensorRT used more).
inference_time_improvement = 100.0 * (inference_time_FP32 - inference_time_trt) / inference_time_FP32
memory_improvement = 100.0 * (memory_usage[0] - memory_usage[2]) / memory_usage[0]
print(f"Inference time improvement with TensorRT: {inference_time_improvement:.2f}%")
print(f"Memory usage improvement with TensorRT: {memory_improvement:.2f}%")
Inference time improvement with TensorRT: 78.89% Memory usage improvement with TensorRT: -13.02%
In [ ]:
# Comparison of the inference time and memory usage using a line plot
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
gpu_backends = ['FP32', 'FP16', 'TensorRT']
ax.plot(gpu_backends, inference_times, label='Inference Time (GPU)', marker='o')
ax.set_ylabel('Time (s)')
ax.set_xlabel('Inference Type')
# Second y-axis sharing the same x-axis for the memory curve.
ax2 = ax.twinx()
ax2.plot(gpu_backends, memory_usage, label='Memory Usage (VRAM)', marker='o', color='red')
ax2.set_ylabel('Memory (GB)')
plt.tight_layout()
fig.legend(loc=(0.7, 0.85))
output = Path(f"./{category}/inference_comparison_line.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
In [ ]:
# Comparison between ONNX and OpenVINO
inference_times_2 = [inference_time_onnx, inference_time_openvino]
memory_usage_2 = [memory_onnx / 1024**3, memory_openvino / 1024**3]  # Convert to GB
cpu_backends = ['ONNX', 'OpenVINO']
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# One bar chart per metric; annotate each bar with its rounded value.
for axis, values, title, ylabel in (
    (ax[0], inference_times_2, 'Inference Time (CPU)', 'Time (s)'),
    (ax[1], memory_usage_2, 'Memory Usage (RAM)', 'Memory (GB)'),
):
    axis.bar(cpu_backends, values)
    axis.set_title(title)
    axis.set_ylabel(ylabel)
    for i, v in enumerate(values):
        axis.text(i, v, str(round(v, 2)), ha='center', va='bottom')
plt.tight_layout()
output = Path(f"./{category}/onnx_openvino_comparison.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
In [ ]:
# Comparison of the inference time and memory usage using a line plot
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
cpu_backends = ['ONNX', 'OpenVINO']
ax.plot(cpu_backends, inference_times_2, label='Inference Time (CPU)', marker='o')
ax.set_ylabel('Time (s)')
ax.set_xlabel('Inference Type')
# Tighten the y range around the two measured values.
ax.set_ylim(min(inference_times_2) - 0.05, max(inference_times_2) + 0.05)
ax2 = ax.twinx()
ax2.plot(cpu_backends, memory_usage_2, label='Memory Usage (RAM)', marker='o', color='red')
ax2.set_ylabel('Memory (GB)')
ax2.set_ylim(min(memory_usage_2) - 1, max(memory_usage_2) + 1)
plt.tight_layout()
fig.legend(loc=(0.7, 0.85))
output = Path(f"./{category}/onnx_openvino_comparison_line.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
In [ ]:
def dice_coefficient(gt_img, pred_img):
    """Dice overlap between two masks, each binarized at value > 0.

    Args:
        gt_img: ground-truth mask array (any numeric values; >0 is foreground).
        pred_img: predicted mask array, same shape.

    Returns:
        Dice score in [0, 1]. Returns 1.0 when both masks are empty
        (the original divided by zero in that case).
    """
    binary_gt = np.where(gt_img > 0, 1, 0).astype(np.uint8)
    binary_pred = np.where(pred_img > 0, 1, 0).astype(np.uint8)
    denom = binary_gt.sum() + binary_pred.sum()
    if denom == 0:
        # Both masks empty: perfect agreement by convention.
        return 1.0
    intersection = np.logical_and(binary_gt, binary_pred)
    dice = (2. * intersection.sum()) / denom
    return dice
def auroc_pixel_level(gt_img, pred_img):
    """Pixel-level AUROC between two masks, both binarized at value > 0."""
    metric = AUROC()
    # Binarize both masks (values may be 0/255) before flattening.
    pred_binary = torch.where(torch.from_numpy(pred_img) > 0, torch.tensor(1), torch.tensor(0))
    gt_binary = torch.where(torch.from_numpy(gt_img) > 0, torch.tensor(1), torch.tensor(0))
    score = metric(pred_binary.view(-1).float(), gt_binary.view(-1).long())
    return score.item()
# Comparison of the segmentation results of all the images for each inference type
segmentation_results = [fp32_output_images, fp_16_output_images, onnx_output_images, openvino_output_images, trt_output_images]
# Flatten row-major: one row per test image, one column per inference backend.
segmentation_results = [segmentation_results[col][row][1].segmentations for row in range(len(segmentation_results[0])) for col in range(len(segmentation_results))]
inferences = ['FP32', 'FP16', 'ONNX', 'OpenVINO', 'TensorRT']
# Create a 5x5 grid of images for comparison (5 backends x 5 test images).
fig, ax = plt.subplots(5, 5, figsize=(20, 20))
# Plot the images row by row; k is the flat index of each row's first column.
k = 0
for i in range(5):
    for j in range(5):
        ax[i, j].imshow(segmentation_results[k + j])
        ax[i, j].axis('off')
        ax[i, j].set_title(inferences[j])
        if j != 0:
            # Compare each backend against the FP32 reference (column 0).
            dice = dice_coefficient(segmentation_results[k], segmentation_results[k + j])
            auroc = auroc_pixel_level(segmentation_results[k], segmentation_results[k + j])
            ax[i, j].set_title(f'{inferences[j]}\nDice: {dice:.2f}\nAUROC: {auroc:.2f}')
    # BUG FIX: was `k = (j + 1) * (i + 1)`, which only produced the correct
    # row base index by relying on the leaked loop variable j == 4;
    # advance to the next row's base explicitly instead.
    k += 5
fig.tight_layout()
output_path = Path(f"./{category}/segmentation_comparison.png")
plt.savefig(output_path)
Image.open(output_path)
Out[ ]:
In [ ]: